# install.packages(c('ggplot2', 'gcookbook'), dependencies = TRUE)
library(ggplot2)
library(gcookbook)


# Making a Basic Scatter Plot ----

# Inspect the structure of the height/weight data used throughout.
str(heightweight)
## 'data.frame':    236 obs. of  5 variables:
##  $ sex     : Factor w/ 2 levels "f","m": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ageYear : num  11.9 12.9 12.8 13.4 15.9 ...
##  $ ageMonth: int  143 155 153 161 191 171 185 142 160 140 ...
##  $ heightIn: num  56.3 62.3 63.3 59 62.5 62.5 59 56.5 62 53.8 ...
##  $ weightLb: num  85 105 108 92 112 ...

# Preview the two columns that are plotted below.
head(heightweight[c('ageYear', 'heightIn')])
##   ageYear heightIn
## 1   11.92     56.3
## 2   12.92     62.3
## 3   12.75     63.3
## 4   13.42     59.0
## 5   15.92     62.5
## 6   14.25     62.5

# Base plot object: age on the x axis, height on the y axis.
p <- ggplot(
  data = heightweight,
  mapping = aes(x = ageYear, y = heightIn)
)

# Scatter plot drawn with point shape 21.
p + geom_point(shape = 21)

# Scatter plot drawn with point size 1.5.
p + geom_point(size = 1.5)


# Grouping Data Points by a Variable Using Shape or Color ----

# Preview the grouping column together with the plotted columns.
head(heightweight[c('sex', 'ageYear', 'heightIn')])
##   sex ageYear heightIn
## 1   f   11.92     56.3
## 2   f   12.92     62.3
## 3   f   12.75     63.3
## 4   f   13.42     59.0
## 5   f   15.92     62.5
## 6   f   14.25     62.5

# Map sex to both the point shape and the point color.
p <- ggplot(
  data = heightweight,
  mapping = aes(x = ageYear, y = heightIn, shape = sex, color = sex)
)
p + geom_point()

# Same plot with manually chosen shapes (1 and 2) and the 'Set1' palette.
p +
  geom_point() +
  scale_shape_manual(values = c(1, 2)) +
  scale_color_brewer(palette = 'Set1')


# Using Different Point Shapes ----

# Every point drawn with a single fixed shape (3).
ggplot(heightweight, aes(x = ageYear, y = heightIn)) +
  geom_point(shape = 3)

# Shape mapped to sex, with the two shapes (1 and 4) chosen manually and
# the points enlarged to size 4.
ggplot(heightweight, aes(x = ageYear, y = heightIn, shape = sex)) +
  geom_point(size = 4) +
  scale_shape_manual(values = c(1, 4))


# Work on a copy so the original data set is left untouched.
hw <- heightweight

# Split weight into two groups at 100 lb.
# right = FALSE makes the bins [-Inf, 100) and [100, Inf), so a weight of
# exactly 100 falls into the '>=100' group, as its label promises. With the
# default (right = TRUE) it would be placed in '< 100'.
hw$weightGroup <- cut(
  hw$weightLb,
  breaks = c(-Inf, 100, Inf),
  labels = c('< 100', '>=100'),
  right = FALSE
)

# Shape encodes sex, fill encodes the weight group. Shapes 21 and 24 are
# used because they accept a fill. The first group gets a fully transparent
# fill and the second a solid black fill.
ggplot(hw, aes(x = ageYear, y = heightIn, shape = sex, fill = weightGroup)) +
  geom_point(size = 2.5) +
  scale_shape_manual(values = c(21, 24)) +
  scale_fill_manual(
    values = c(alpha('black', 0), 'black'),
    # Draw the fill legend keys with shape 21 so the fill is visible there.
    guide = guide_legend(override.aes = list(shape = 21))
  )


# Mapping a Continuous Variable to Color or Size ----

# Weight mapped to point color.
ggplot(heightweight, aes(x = ageYear, y = heightIn, color = weightLb)) +
  geom_point()

# Weight mapped to point size.
ggplot(heightweight, aes(x = ageYear, y = heightIn, size = weightLb)) +
  geom_point()


# Weight mapped to fill on shape 21, using a black-to-white gradient.
ggplot(heightweight, aes(x = ageYear, y = heightIn, fill = weightLb)) +
  geom_point(shape = 21, size = 2.5) +
  scale_fill_gradient(low = 'black', high = 'white')

# Same gradient, but with guide_legend() requested as the fill guide.
ggplot(heightweight, aes(x = ageYear, y = heightIn, fill = weightLb)) +
  geom_point(shape = 21, size = 2.5) +
  scale_fill_gradient(
    low = 'black',
    high = 'white',
    guide = guide_legend()
  )


# Size from weight and color from sex. Points are semi-transparent
# (alpha = 0.5) and the size scale is limited to the range 1-6.
ggplot(
  heightweight,
  aes(x = ageYear, y = heightIn, size = weightLb, color = sex)
) +
  geom_point(alpha = 0.5) +
  scale_size(range = c(1, 6)) +
  scale_color_brewer(palette = 'Set1')


# Adding Fitted Regression Model Lines ----

# Base scatter plot object reused by the smoothing examples below.
sp <- ggplot(
  data = heightweight,
  mapping = aes(x = ageYear, y = heightIn)
)

# Linear model fit with the default settings.
sp +
  geom_point() +
  stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Linear model fit with level = 0.99.
sp +
  geom_point() +
  stat_smooth(method = lm, level = 0.99)
## `geom_smooth()` using formula 'y ~ x'

# Linear model fit with se = FALSE.
sp +
  geom_point() +
  stat_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# No method given; the message below reports that loess is used.
sp +
  geom_point() +
  stat_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'


# Scatter plot colored by sex, so smoothers are fitted per group.
sps <- ggplot(
  data = heightweight,
  mapping = aes(x = ageYear, y = heightIn, color = sex)
) +
  geom_point() +
  scale_color_brewer(palette = 'Set1')

sps + stat_smooth(method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Same per-group fit with fullrange = TRUE.
sps + stat_smooth(method = lm, fullrange = TRUE)
## `geom_smooth()` using formula 'y ~ x'


# Adding Annotations with Model Coefficients ----

# Fit height as a linear function of age and show the model summary.
model <- lm(heightIn ~ ageYear, data = heightweight)
summary(model)
## 
## Call:
## lm(formula = heightIn ~ ageYear, data = heightweight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.3517 -1.9006  0.1378  1.9071  8.3371 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.4356     1.8281   20.48   <2e-16 ***
## ageYear       1.7483     0.1329   13.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.989 on 234 degrees of freedom
## Multiple R-squared:  0.4249, Adjusted R-squared:  0.4225 
## F-statistic: 172.9 on 1 and 234 DF,  p-value: < 2.2e-16

# Scatter plot with a linear fit, used as the base for the annotations.
sp <- ggplot(heightweight, aes(x = ageYear, y = heightIn)) +
  geom_point() +
  stat_smooth(method = lm)

# Hand-written label; parse = TRUE makes the text be parsed as an expression.
sp + annotate(
  'text',
  label = 'r^2==0.42',
  parse = TRUE,
  x = 16.5,
  y = 52
)
## `geom_smooth()` using formula 'y ~ x'

# Build the equation label from the fitted model: substitute the formatted
# intercept, slope and R-squared into a template expression, then convert
# the result to a character string for annotate().
est <- unname(coef(model))
eqn <- as.character(
  as.expression(
    substitute(
      italic(y) == a + b * italic(x) * ',' ~~ italic(r)^2 ~ '=' ~ r2,
      list(
        a = format(est[1], digits = 3),
        b = format(est[2], digits = 3),
        r2 = format(summary(model)$r.squared, digits = 2)
      )
    )
  )
)

# Check that the generated string parses back into an expression.
parse(text = eqn)
## expression(italic(y) == "37.4" + "1.75" * italic(x) * "," ~ ~italic(r)^2 ~ 
##     "=" ~ "0.42")

# Place the label via infinite x/y coordinates, offset with hjust/vjust.
sp + annotate(
  'text',
  label = eqn,
  parse = TRUE,
  x = Inf,
  y = -Inf,
  hjust = 1.1,
  vjust = -0.5
)
## `geom_smooth()` using formula 'y ~ x'


# Adding Marginal Rugs to a Scatter Plot ----

# Scatter plot of the faithful data with rug marks added.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_rug()

# Same plot with the rug marks jittered and drawn at size 0.2.
ggplot(faithful, aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_rug(position = 'jitter', size = 0.2)


# Labeling Points in a Scatter Plot ----

# Keep the 2009 rows whose health expenditure exceeds 2000.
cs <- subset(countries, Year == 2009 & healthexp > 2000)
str(cs)
## 'data.frame':    27 obs. of  7 variables:
##  $ Name        : Factor w/ 216 levels "Afghanistan",..: 5 11 12 19 34 53 66 67 72 75 ...
##  $ Code        : Factor w/ 216 levels "ABW","AFG","AGO",..: 5 11 12 15 33 54 63 65 51 77 ...
##  $ Year        : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
##  $ GDP         : num  NA 42131 45555 43640 39599 ...
##  $ laborrate   : num  NA 65.2 60.4 53.5 67.8 65.4 60.9 56.1 59.8 53.7 ...
##  $ healthexp   : num  3090 3867 5037 5104 4380 ...
##  $ infmortality: num  3.1 4.2 3.6 3.6 5.2 3.4 2.5 3.5 3.5 3.5 ...

# Base scatter plot: health expenditure against infant mortality.
sp <- ggplot(cs, aes(x = healthexp, y = infmortality)) +
  geom_point()

# Label every point with its country name.
sp + geom_text(aes(label = Name), size = 4)

# Same labels with vjust = -1.
sp + geom_text(aes(label = Name), size = 4, vjust = -1)

# Same labels with hjust = -0.1.
sp + geom_text(aes(label = Name), size = 4, hjust = -0.1)


# Two labels placed by hand at explicit coordinates.
sp +
  annotate('text', x = 4350, y = 5.4, label = 'Canada') +
  annotate('text', x = 7400, y = 6.8, label = 'USA')


# Label only a chosen subset of countries: copy the names, then blank out
# (set to NA) every name that is not in the list so geom_text() skips it.
cs$Name1 <- cs$Name
# 'Netherlands' was previously misspelled as 'Netherland'. %in% does not
# report unmatched names, so that country was silently left unlabeled.
idx <- cs$Name1 %in% c('Canada', 'Ireland', 'United Kingdom', 'United States',
                       'Japan', 'New Zealand', 'Iceland', 'Luxembourg',
                       'Netherlands', 'Switzerland')
cs$Name1[!idx] <- NA

# The label mapping is declared once in ggplot() and inherited by
# geom_text(), which only overrides x to shift the text 100 units to the
# right of each point. xlim() sets the x range to 2000-9000.
ggplot(cs, aes(x = healthexp, y = infmortality, label = Name1)) +
  geom_point() +
  geom_text(aes(x = healthexp + 100), size = 4, hjust = 0) +
  xlim(2000, 9000)
# NOTE(review): this warning reported 18 rows while 'Netherlands' was
# misspelled. 17 is expected now (27 rows minus 10 labeled countries),
# assuming the Netherlands row is present in cs. Re-run to confirm.
## Warning: Removed 17 rows containing missing values (geom_text).


# Creating a Balloon Plot ----

# Select the 2009 rows for a fixed list of countries.
# 'Netherlands' was previously misspelled as 'Netherland'. %in% does not
# report unmatched names, so that country was silently dropped.
cdat <- subset(
  countries,
  Year == 2009 &
    Name %in% c('Canada', 'Ireland', 'United Kingdom', 'United States',
                'New Zealand', 'Iceland', 'Japan', 'Luxembourg',
                'Netherlands', 'Switzerland')
)

# Balloon plot: point size is mapped to GDP. Points use shape 21 with a
# black outline and a 'cornsilk' fill.
p <- ggplot(cdat, aes(x = healthexp, y = infmortality, size = GDP)) +
  geom_point(shape = 21, color = 'black', fill = 'cornsilk')
p

# Size scale from scale_size_area() with a maximum size of 15.
p + scale_size_area(max_size = 15)

# Size scale from scale_size() with the range 3-10.
p + scale_size(range = c(3, 10))